#title: "Project - Part II"
#student: Kaimi Huang (kh2908)
#objective of research: the objective of the research is to identify factors that relate to high systolic and diastolic blood pressures

library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.2.0     ✓ forcats 0.5.1
## ✓ readr   2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)

#comparison of (two) means t-test

#the first t-test compares the genders' blood pressure averages to see whether there is a significant differences
#load the gender data frame

gender <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/gender.csv")
dim(gender)
## [1] 11656     9
head(gender)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level gender gender_description
## 1                   normal                   normal      2             female
## 2                   normal                   normal      2             female
## 3                 elevated                   normal      2             female
## 4                   normal                   normal      1               male
## 5                   normal                   normal      1               male
## 6                     high                   normal      1               male
names(gender)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "gender"                     
## [9] "gender_description"
#for assumption, check that the sample size is greater than 40 for both genders
table(gender$gender_description)
## 
## female   male 
##   5932   5724
#calculate the means and standard deviations of systolic blood pressure by gender
sum_stat_sys_by_gender <- gender %>%
       select(Systolic, gender_description) %>%
       
       #group by gender_description
       group_by(gender_description) %>%
    
       #calculate the means and sds
       summarize(mean_sys = mean(Systolic, na.rm=TRUE), sd_sys = sd(Systolic, na.rm=TRUE))

print(sum_stat_sys_by_gender)
## # A tibble: 2 × 3
##   gender_description mean_sys sd_sys
##   <chr>                 <dbl>  <dbl>
## 1 female                 118.   20.8
## 2 male                   122.   18.8
#calculate the means and standard deviations of diastolic blood pressure by gender
sum_stat_dia_by_gender <- gender %>%
       select(Diastolic, gender_description) %>%
       
       #group by gender_description
       group_by(gender_description) %>%
    
       #calculate the means and sds
       summarize(mean_dia = mean(Diastolic, na.rm=TRUE), sd_dia = sd(Diastolic, na.rm=TRUE))

print(sum_stat_dia_by_gender)
## # A tibble: 2 × 3
##   gender_description mean_dia sd_dia
##   <chr>                 <dbl>  <dbl>
## 1 female                 71.6   12.2
## 2 male                   72.5   12.6
#create a comparative boxplot of median systolic blood pressure by gender
ggplot(gender, aes(Systolic, gender_description)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Systolic Blood Pressure by Gender")
## Warning: Removed 1304 rows containing non-finite values (stat_boxplot).

#create a comparative boxplot of median diastolic blood pressure by gender
ggplot(gender, aes(Diastolic, gender_description)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Diastolic Blood Pressure by Gender")
## Warning: Removed 1304 rows containing non-finite values (stat_boxplot).

#conduct a comparison of means test for systolic blood pressure
t.test(Systolic~gender_description, data=gender, equal.var=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Systolic by gender_description
## t = -12.506, df = 10270, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -5.627164 -4.102184
## sample estimates:
## mean in group female   mean in group male 
##             117.5892             122.4539
#conduct a comparison of means test for diastolic blood pressure
t.test(Diastolic~gender_description, data=gender, equal.var=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Diastolic by gender_description
## t = -3.6193, df = 10332, p-value = 0.0002968
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -1.3613409 -0.4048084
## sample estimates:
## mean in group female   mean in group male 
##             71.59728             72.48035
#comparison of (two) means t-test

#the second t-test compares the blood pressure averages of those who have health insurance and of those who do not to see whether there is a significant differences

#load the insurance data frame
health_insurance <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/health_insurance.csv")
dim(health_insurance)
## [1] 11629     9
head(health_insurance)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level health.insurance
## 1                   normal                   normal                1
## 2                   normal                   normal                1
## 3                 elevated                   normal                1
## 4                   normal                   normal                1
## 5                   normal                   normal                1
## 6                     high                   normal                1
##   coverd_by_health_insurance
## 1                        yes
## 2                        yes
## 3                        yes
## 4                        yes
## 5                        yes
## 6                        yes
names(health_insurance)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "health.insurance"           
## [9] "coverd_by_health_insurance"
#for assumption, check that the sample size is greater than 40 for both yes and no groups (answer to the question of whether one is covered by insurance)
table(health_insurance$coverd_by_health_insurance)
## 
##    no   yes 
##  1598 10031
#calculate the means and standard deviations of systolic blood pressure by coverd_by_health_insurance (yes/no)
sum_stat_sys_by_insurance <- health_insurance %>%
       select(Systolic, coverd_by_health_insurance) %>%
       
       #group by coverd_by_health_insurance
       group_by(coverd_by_health_insurance) %>%
    
       #calculate the means and sds
       summarize(mean_sys = mean(Systolic, na.rm=TRUE), sd_sys = sd(Systolic, na.rm=TRUE))

print(sum_stat_sys_by_insurance)
## # A tibble: 2 × 3
##   coverd_by_health_insurance mean_sys sd_sys
##   <chr>                         <dbl>  <dbl>
## 1 no                             121.   19.7
## 2 yes                            120.   20.0
#calculate the means and standard deviations of diastolic blood pressure by coverd_by_health_insurance (yes/no)
sum_stat_dia_by_insurance <- health_insurance %>%
       select(Diastolic, coverd_by_health_insurance) %>%
       
       #group by coverd_by_health_insurance
       group_by(coverd_by_health_insurance) %>%
    
       #calculate the means and sds
       summarize(mean_dia = mean(Diastolic, na.rm=TRUE), sd_dia = sd(Diastolic, na.rm=TRUE))

print(sum_stat_dia_by_insurance)
## # A tibble: 2 × 3
##   coverd_by_health_insurance mean_dia sd_dia
##   <chr>                         <dbl>  <dbl>
## 1 no                             74.5   13.2
## 2 yes                            71.7   12.2
#create a comparative boxplot of median systolic blood pressure by coverd_by_health_insurance
ggplot(health_insurance, aes(Systolic, coverd_by_health_insurance)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Systolic Blood Pressure by Insurance")
## Warning: Removed 1302 rows containing non-finite values (stat_boxplot).

#create a comparative boxplot of median diastolic blood pressure by coverd_by_health_insurance
ggplot(health_insurance, aes(Diastolic, coverd_by_health_insurance)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Diastolic Blood Pressure by Insurance")
## Warning: Removed 1302 rows containing non-finite values (stat_boxplot).

#conduct a comparison of means test for systolic blood pressure
t.test(Systolic~coverd_by_health_insurance, data=health_insurance, equal.var=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Systolic by coverd_by_health_insurance
## t = 2.1272, df = 1842.2, p-value = 0.03353
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  0.094783 2.334828
## sample estimates:
##  mean in group no mean in group yes 
##          121.0595          119.8447
#conduct a comparison of means test for diastolic blood pressure
t.test(Diastolic~coverd_by_health_insurance, data=health_insurance, equal.var=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Diastolic by coverd_by_health_insurance
## t = 7.5038, df = 1760.3, p-value = 9.789e-14
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  2.101401 3.588650
## sample estimates:
##  mean in group no mean in group yes 
##          74.49746          71.65244
#chi-square test

#the first chi-square test compares blood pressure level proportions by race to see if there is an association between high blood pressure and race

#load the race data frame
race <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/race.csv")
dim(race)
## [1] 11656     9
head(race)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level race race_description
## 1                   normal                   normal    1 Mexican American
## 2                   normal                   normal    6            Asian
## 3                 elevated                   normal    4            Black
## 4                   normal                   normal    3            White
## 5                   normal                   normal    3            White
## 6                     high                   normal    7       Other Race
names(race)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "race"                       
## [9] "race_description"
#get rid of the NA values
race <- na.omit(race)

#create tables that are by race and by blood pressure levels 
table_sys <- table(race$sys_blood_pressure_level, race$race_description) 
print(table_sys)
##           
##            Asian Black Mexican American Other Hispanic Other Race White
##   elevated   212   449              222            168         94   620
##   high       267   910              238            269        120   929
##   normal     640  1408              812            575        414  2005
table_dia <- table(race$dia_blood_pressure_level, race$race_description) 
print(table_dia)
##         
##          Asian Black Mexican American Other Hispanic Other Race White
##   high     276   891              247            242        145   848
##   normal   843  1876             1025            770        483  2706
#create bar charts to compare proportions
ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

#conduct chi-square tests of independence fo systolic blood pressure levels
chisq.test(race$sys_blood_pressure_level, race$race_description)
## 
##  Pearson's Chi-squared test
## 
## data:  race$sys_blood_pressure_level and race$race_description
## X-squared = 131.21, df = 10, p-value < 2.2e-16
#store the test to create a table of the expected counts
Xsq_sys <- chisq.test(race$sys_blood_pressure_level, race$race_description)
Xsq_sys$expected
##                              race$race_description
## race$sys_blood_pressure_level    Asian     Black Mexican American
##                      elevated 190.7878  471.7692         216.8740
##                      high     295.4238  730.5072         335.8168
##                      normal   632.7884 1564.7235         719.3091
##                              race$race_description
## race$sys_blood_pressure_level Other Hispanic Other Race     White
##                      elevated       172.5444   107.0730  605.9515
##                      high           267.1750   165.7964  938.2807
##                      normal         572.2805   355.1306 2009.7678
#conduct chi-square tests of independence fo diastolic blood pressure levels
chisq.test(race$dia_blood_pressure_level, race$race_description)
## 
##  Pearson's Chi-squared test
## 
## data:  race$dia_blood_pressure_level and race$race_description
## X-squared = 98.599, df = 5, p-value < 2.2e-16
#store the test to create a table of the expected counts
Xsq_dia <-chisq.test(race$dia_blood_pressure_level, race$race_description)
Xsq_dia$expected
##                              race$race_description
## race$dia_blood_pressure_level    Asian     Black Mexican American
##                        high   286.3438  708.0548         325.4954
##                        normal 832.6562 2058.9452         946.5046
##                              race$race_description
## race$dia_blood_pressure_level Other Hispanic Other Race     White
##                        high         258.9633   160.7005  909.4422
##                        normal       753.0367   467.2995 2644.5578
#chi-square test

#the second chi-square test compares blood pressure level proportions by education level to see if there is an association between high blood pressures and education

#load the education data frame
education <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/education.csv")
dim(education)
## [1] 11656     9
head(education)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level education
## 1                   normal                   normal        NA
## 2                   normal                   normal         5
## 3                 elevated                   normal        NA
## 4                   normal                   normal         2
## 5                   normal                   normal         4
## 6                     high                   normal         4
##             education_level
## 1                      <NA>
## 2 College graduate or above
## 3                      <NA>
## 4              9-11th grade
## 5              Some college
## 6              Some college
names(education)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "education"                  
## [9] "education_level"
#get rid of the NA values
education <- na.omit(education)
dim(education)
## [1] 7648    9
#remove the "Don't Know" and "Refused" columns as there are too few counts in those columns and keeping them may affect the test
condition_1 <- education$education_level != "Don't Know"
table(condition_1)
## condition_1
## FALSE  TRUE 
##     4  7644
education <- education[condition_1,]

condition_2 <- education$education_level != "Refused"
table(condition_2)
## condition_2
## FALSE  TRUE 
##     1  7643
education <- education[condition_2,]
dim(education)
## [1] 7643    9
#create tables that are by education level and by blood pressure levels 
table_sys <- table(education$sys_blood_pressure_level, education$education_level) 
print(table_sys)
##           
##            9-11th grade College graduate or above High school graduate
##   elevated          152                       423                  366
##   high              343                       548                  702
##   normal            338                       930                  790
##           
##            Less than 9th grade Some college
##   elevated                  92          519
##   high                     245          841
##   normal                   181         1173
table_dia <- table(education$dia_blood_pressure_level, education$education_level) 
print(table_dia)
##         
##          9-11th grade College graduate or above High school graduate
##   high            279                       564                  641
##   normal          554                      1337                 1217
##         
##          Less than 9th grade Some college
##   high                   168          892
##   normal                 350         1641
#create bar charts to compare proportions
ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education")

ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education")

#conduct chi-square tests of independence for systolic blood pressure levels
chisq.test(education$sys_blood_pressure_level, education$education_level)
## 
##  Pearson's Chi-squared test
## 
## data:  education$sys_blood_pressure_level and education$education_level
## X-squared = 91.084, df = 8, p-value = 2.802e-16
#store the test to create a table of the expected counts
Xsq_sys <- chisq.test(education$sys_blood_pressure_level, education$education_level)
Xsq_sys$expected
##           education$education_level
##            9-11th grade College graduate or above High school graduate
##   elevated     169.1503                  386.0201             377.2885
##   high         291.9805                  666.3325             651.2602
##   normal       371.8692                  848.6474             829.4513
##           education$education_level
##            Less than 9th grade Some college
##   elevated            105.1859     514.3551
##   high                181.5677     887.8591
##   normal              231.2464    1130.7858
#conduct chi-square tests of independence for diastolic blood pressure levels
chisq.test(education$dia_blood_pressure_level, education$education_level)
## 
##  Pearson's Chi-squared test
## 
## data:  education$dia_blood_pressure_level and education$education_level
## X-squared = 16.865, df = 4, p-value = 0.002053
#store the test to create a table of the expected counts
Xsq_dia <-chisq.test(education$dia_blood_pressure_level, education$education_level)
Xsq_dia$expected
##         education$education_level
##          9-11th grade College graduate or above High school graduate
##   high        277.267                  632.7547              618.442
##   normal      555.733                 1268.2453             1239.558
##         education$education_level
##          Less than 9th grade Some college
##   high              172.4182     843.1181
##   normal            345.5818    1689.8819
#linear regression analysis

#the first linear regression analysis checks whether there is a linear relationship between blood pressures and income

#load the income data frame
income <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/income.csv")
dim(income)
## [1] 11656     8
head(income)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level poverty_level_index
## 1                   normal                   normal                1.29
## 2                   normal                   normal                5.00
## 3                 elevated                   normal                1.69
## 4                   normal                   normal                1.20
## 5                   normal                   normal                0.53
## 6                     high                   normal                1.20
names(income)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "poverty_level_index"
#get rid of the NA values
income <- na.omit(income)

#plot the variables to create scatter plots
ggplot(income, aes(poverty_level_index, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) 

ggplot(income, aes(poverty_level_index, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#calculate the correlations between systolic, diastolic blood pressures and income
print(cor(income$poverty_level_index,income$Systolic))
## [1] 0.03808412
print(cor(income$poverty_level_index,income$Diastolic))
## [1] 0.03680397
#create models
sys_inc_model <- lm(Systolic ~ poverty_level_index, data = income)
print(sys_inc_model)
## 
## Call:
## lm(formula = Systolic ~ poverty_level_index, data = income)
## 
## Coefficients:
##         (Intercept)  poverty_level_index  
##            118.4145               0.4861
dia_inc_model <- lm(Diastolic ~ poverty_level_index, data = income)
print(dia_inc_model)
## 
## Call:
## lm(formula = Diastolic ~ poverty_level_index, data = income)
## 
## Coefficients:
##         (Intercept)  poverty_level_index  
##              71.150                0.294
#calculate and plot the residuals
sys_residuals<-resid(sys_inc_model)
hist(sys_residuals)

dia_residuals<-resid(dia_inc_model)
hist(dia_residuals)

#Normal probability plots of the residuals
qqnorm(sys_residuals)

qqnorm(dia_residuals)

#Calculate standard residuals
sys_standard_residuals <- rstandard(sys_inc_model)
length(sys_standard_residuals)
## [1] 8345
dia_standard_residuals <- rstandard(dia_inc_model)
length(dia_standard_residuals)
## [1] 8345
dim(income)
## [1] 8345    8
#plot the standard residuals
hist(sys_standard_residuals)

hist(dia_standard_residuals)

#Create standardized residual plots
plot(fitted(sys_inc_model), sys_standard_residuals)

plot(fitted(dia_inc_model), dia_standard_residuals)

#Combine regression variables with standard residuals

sys_income_stdres <- cbind(income, sys_standard_residuals)
dim(sys_income_stdres)
## [1] 8345    9
dia_income_stdres <- cbind(income, dia_standard_residuals)
dim(dia_income_stdres)
## [1] 8345    9
#Remove absolute values of standard residuals that are greater than 2.
sys_income_analysis <- sys_income_stdres[abs(sys_income_stdres$sys_standard_residuals) < 2,]
dim(sys_income_analysis)
## [1] 7938    9
dia_income_analysis <- dia_income_stdres[abs(dia_income_stdres$dia_standard_residuals) < 2,]
dim(dia_income_analysis)
## [1] 7981    9
#histogram of the standard residuals after removing outliers
hist(sys_income_analysis$sys_standard_residuals)

hist(dia_income_analysis$dia_standard_residuals)

#fit the least squares (regression) line to the scatter plots that are without outliers
ggplot(sys_income_analysis, aes(poverty_level_index, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(dia_income_analysis, aes(poverty_level_index, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

#perform linear regression analyses
summary(lm(Systolic ~ poverty_level_index, data = sys_income_analysis))
## 
## Call:
## lm(formula = Systolic ~ poverty_level_index, data = sys_income_analysis)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.823 -11.863  -2.114  10.445  42.308 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         115.4656     0.3200 360.848  < 2e-16 ***
## poverty_level_index   0.7297     0.1153   6.331 2.56e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.02 on 7936 degrees of freedom
## Multiple R-squared:  0.005026,   Adjusted R-squared:  0.004901 
## F-statistic: 40.09 on 1 and 7936 DF,  p-value: 2.561e-10
summary(lm(Diastolic ~ poverty_level_index, data = dia_income_analysis))
## 
## Call:
## lm(formula = Diastolic ~ poverty_level_index, data = dia_income_analysis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.9257  -7.9257  -0.5485   7.5868  25.8428 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         69.96940    0.21140 330.978  < 2e-16 ***
## poverty_level_index  0.39127    0.07652   5.113 3.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.6 on 7979 degrees of freedom
## Multiple R-squared:  0.003266,   Adjusted R-squared:  0.003141 
## F-statistic: 26.15 on 1 and 7979 DF,  p-value: 3.236e-07
#the next linear regression analysis checks whether there is a linear relationship between blood pressures and sleep

#load the sleep data frame
sleep <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/sleep.csv")
dim(sleep)
## [1] 11656     8
head(sleep)
##   X Respondent.sequence.number. Systolic Diastolic     Reading
## 1 1                      109264      109        67 1st reading
## 2 2                      109266       99        56 1st reading
## 3 3                      109270      123        73 1st reading
## 4 4                      109271      102        65 1st reading
## 5 5                      109273      116        68 1st reading
## 6 6                      109274      138        70 1st reading
##   sys_blood_pressure_level dia_blood_pressure_level workday_sleep_hrs
## 1                   normal                   normal                NA
## 2                   normal                   normal               7.5
## 3                 elevated                   normal                NA
## 4                   normal                   normal              10.0
## 5                   normal                   normal               6.5
## 6                     high                   normal               9.5
names(sleep)
## [1] "X"                           "Respondent.sequence.number."
## [3] "Systolic"                    "Diastolic"                  
## [5] "Reading"                     "sys_blood_pressure_level"   
## [7] "dia_blood_pressure_level"    "workday_sleep_hrs"
#get rid of the NA values
sleep <- na.omit(sleep)

#plot the variables to create scatter plots
ggplot(sleep, aes(workday_sleep_hrs, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) 

ggplot(sleep, aes(workday_sleep_hrs, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#calculate the correlations between systolic, diastolic blood pressures and sleep
print(cor(sleep$workday_sleep_hrs,sleep$Systolic))
## [1] -0.02701154
print(cor(sleep$workday_sleep_hrs,sleep$Diastolic))
## [1] -0.06131704
#create models
sys_sleep_model <- lm(Systolic ~ workday_sleep_hrs, data = sleep)
print(sys_sleep_model)
## 
## Call:
## lm(formula = Systolic ~ workday_sleep_hrs, data = sleep)
## 
## Coefficients:
##       (Intercept)  workday_sleep_hrs  
##          126.0649            -0.3178
dia_sleep_model <- lm(Diastolic ~ workday_sleep_hrs, data = sleep)
print(dia_sleep_model)
## 
## Call:
## lm(formula = Diastolic ~ workday_sleep_hrs, data = sleep)
## 
## Coefficients:
##       (Intercept)  workday_sleep_hrs  
##           77.6682            -0.4367
#calculate and plot the residuals
sys_residuals<-resid(sys_sleep_model)
hist(sys_residuals)

dia_residuals<-resid(dia_sleep_model)
hist(dia_residuals)

#Normal probability plots of the residuals
qqnorm(sys_residuals)

qqnorm(dia_residuals)

#Calculate standard residuals
sys_standard_residuals <- rstandard(sys_sleep_model)
length(sys_standard_residuals)
## [1] 8391
dia_standard_residuals <- rstandard(dia_sleep_model)
length(dia_standard_residuals)
## [1] 8391
dim(sleep)
## [1] 8391    8
#plot the standard residuals
hist(sys_standard_residuals)

hist(dia_standard_residuals)

#Create standardized residual plots
plot(fitted(sys_sleep_model), sys_standard_residuals)

plot(fitted(dia_sleep_model), dia_standard_residuals)

#Combine regression variables with standard residuals
sys_sleep_stdres <- cbind(sleep, sys_standard_residuals)
dim(sys_sleep_stdres)
## [1] 8391    9
dia_sleep_stdres <- cbind(sleep, dia_standard_residuals)
dim(dia_sleep_stdres)
## [1] 8391    9
#Remove absolute values of standard residuals that are greater than 2.
sys_sleep_analysis <- sys_sleep_stdres[abs(sys_sleep_stdres$sys_standard_residuals) < 2,]
dim(sys_sleep_analysis)
## [1] 7998    9
dia_sleep_analysis <- dia_sleep_stdres[abs(dia_sleep_stdres$dia_standard_residuals) < 2,]
dim(dia_sleep_analysis)
## [1] 8019    9
#histogram of the standard residuals after removing outliers
hist(sys_sleep_analysis$sys_standard_residuals)

hist(dia_sleep_analysis$dia_standard_residuals)

#fit the least squares (regression) line to the scatter plots that are without outliers
ggplot(sys_sleep_analysis, aes(workday_sleep_hrs, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(dia_sleep_analysis, aes(workday_sleep_hrs, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

#perform linear regression analyses
summary(lm(Systolic ~ workday_sleep_hrs, data = sys_sleep_analysis))
## 
## Call:
## lm(formula = Systolic ~ workday_sleep_hrs, data = sys_sleep_analysis)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.793 -11.965  -1.879  10.465  41.584 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       124.0730     0.8408 147.561  < 2e-16 ***
## workday_sleep_hrs  -0.3657     0.1081  -3.381 0.000725 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.06 on 7996 degrees of freedom
## Multiple R-squared:  0.001428,   Adjusted R-squared:  0.001303 
## F-statistic: 11.43 on 1 and 7996 DF,  p-value: 0.0007251
summary(lm(Diastolic ~ workday_sleep_hrs, data = dia_sleep_analysis))
## 
## Call:
## lm(formula = Diastolic ~ workday_sleep_hrs, data = dia_sleep_analysis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.9057  -7.4218  -0.4218   7.0943  24.3847 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       77.51872    0.52905 146.524  < 2e-16 ***
## workday_sleep_hrs -0.51615    0.06807  -7.583 3.76e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.08 on 8017 degrees of freedom
## Multiple R-squared:  0.007121,   Adjusted R-squared:  0.006997 
## F-statistic:  57.5 on 1 and 8017 DF,  p-value: 3.76e-14